Overview

This file simulates the data I anticipate for the coordinated analysis that will form my dissertation.

# Study-level metadata for the eight datasets in the coordinated analysis:
# sample sizes (targets, perceivers, videos) and two design features.
metadata <- tibble(
  study = c("new_moms", "deception_detection", "karyn_diss", "murat_rep",
            "mideast_men", "stem", "barter", "double_empathy"),
  targets    = c(20, 95, 212, 200, 9, 59, 310, 8),
  perceivers = c(60, 95, 212, 200, 326, 121, 310, 100),
  videos     = c(20, 95, 318, 300, 9, 121, 155, 8),
  # "ss" = Standard Stimulus, "di" = Dyadic Interaction
  paradigm = factor(
    c("ss", "di", "di", "di", "ss", "di", "di", "ss"),
    levels = c("ss", "di"),
    labels = c("Standard Stimulus", "Dyadic Interaction")
  ),
  inference_schedule = factor(
    c("Variable", "Variable", "Set", "Set",
      "Variable", "Set", "Variable", "Set")
  )
)
vrm <- c("Disclosure", "Edification", "Advisement", "Confirmation", "Question", "Acknowledgment", "Interpretation", "Reflection")
# Draw a single value from Normal(mean, sd), rounded to `digits`, using
# rejection sampling so the result always lands inside [min, max].
# Returns one numeric scalar.
generate_random_number <- function(mean = 8, sd = 3, min = 3, max = 19, digits = 0) {
  repeat {
    draw <- round(rnorm(1, mean = mean, sd = sd), digits)
    # Accept the first draw that falls inside the closed interval.
    if (draw >= min && draw <= max) {
      return(draw)
    }
  }
}

multiply_out <- function(df, n_column, column_name) {
  # Expand `df` so each row is repeated `df[[n_column]]` times, adding a new
  # column named `column_name` that holds the within-row sequence 1..n.
  #
  # Args:
  #   df: a data frame/tibble.
  #   n_column: string; name of the column giving each row's repeat count.
  #   column_name: string; name of the new sequence column to create.
  #
  # Returns: the expanded data frame.
  # NOTE(review): rowwise() grouping may persist on the object returned by
  # unnest(); callers in this file always regroup afterwards, but verify
  # before reusing this helper elsewhere.
  df_expanded <- df %>%
    rowwise() %>%
    mutate(!!column_name := list(seq_len(!!sym(n_column)))) %>%
    unnest(cols = !!sym(column_name))
  
  return(df_expanded)
}
SimulateStudy <- function(study_name, paradigm, seed = 123, n_perceivers = 1, n_videos_per_perceiver = 1){
  # Simulate one study's turn-level dataset from the `metadata` table.
  #
  # Args:
  #   study_name: one of metadata$study.
  #   paradigm: "DI" (dyadic interaction) or "SS" (standard stimulus);
  #     any other value returns the stimulus-level data without
  #     participant identifiers.
  #   seed: RNG seed. NOTE(review): the default is shared across studies,
  #     so all studies simulated with defaults reuse the same seed.
  #   n_perceivers, n_videos_per_perceiver: used only when paradigm == "SS".
  #
  # Returns: a tibble with one row per turn, grouped by (name, chapter).
  set.seed(seed)

  # Filter for current study
  study_data <- metadata %>%
    filter(study == study_name)

  # Simulate number of chapters within each video
  df <- tibble(
    name = paste0(study_name, "_", seq_len(study_data$videos)),
    n_video = seq_len(study_data$videos),
    n_chapter = NA
  )
  for (i in seq_len(study_data$videos)) {
    df$n_chapter[i] <- generate_random_number()
  }
  df <- multiply_out(df, n_column = "n_chapter", column_name = "chapter")

  # Simulate number of turns within each chapter.
  # BUG FIX: after multiply_out() the frame has one row per video-chapter,
  # so this loop must run over nrow(df). The original bound
  # (study_data$videos) left n_turns NA for most rows and triggered the
  # "Unknown or uninitialised column: `n_turns`" warnings.
  df$n_turns <- NA_real_
  for (i in seq_len(nrow(df))) {
    df$n_turns[i] <- generate_random_number(mean = 11, sd = 6,
                                            min = 4, max = 40)
  }
  df <- multiply_out(df, n_column = "n_turns", column_name = "turn")

  # STIMULUS LEVEL VARIABLES (computed within each video chapter)
  df <- df %>%
    group_by(name, chapter) %>%
    mutate(
      # One duration draw per chapter (recycled across its turns).
      chapter_length = generate_random_number(mean = 45, sd = 6,
                                              min = 18, max = 120,
                                              digits = 3),
      # Random raw turn lengths, rescaled to sum to chapter_length.
      turn_length = {raw_turn_lengths <- runif(n(), min = 4, max = 40)
                     scaled_turn_lengths <- raw_turn_lengths / sum(raw_turn_lengths) *
                      chapter_length
                     round(scaled_turn_lengths, 3)
                    },
      start_time = cumsum(lag(turn_length, default = 0)),
      end_time = cumsum(turn_length),
      turns_from_inference = n() - row_number() + 1,
      turn_percent_through_chapter = (row_number() / n()) * 100,
      time_percent_through_chapter = end_time / chapter_length * 100,
      # Randomly choose which speaker opens the chapter, then alternate.
      speaker = ifelse(rep(sample(c(TRUE, FALSE), 1), n()),
                         rep(c("Partner", "Target"), length.out = n()),
                         rep(c("Target", "Partner"), length.out = n()))  %>%
        factor(),
      # Semantic similarity drifts toward 1 as the chapter progresses
      # (different pull/noise for Partner vs Target); redraw the whole
      # chapter until fewer than 5% of values would be clipped at +/-1.
      sem_sim = {
        repeat {
          base_random <- runif(n(), min = -1.00, max = 1.00)
          weight <- ifelse(speaker == "Partner",
                           ((turn_percent_through_chapter - 1) / 180)^2,
                           ((turn_percent_through_chapter - 1) / 120)^2)
          noise <- ifelse(speaker == "Partner",
                          rnorm(n(), mean = 0, sd = 0.3),
                          rnorm(n(), mean = 0, sd = 0.1))
          sem_sim_raw <- base_random * (1 - weight) + 1 * weight + noise
          if (sum(sem_sim_raw <= -0.99 | sem_sim_raw >= 0.99) / n() < 0.05) {
            break
          }
        }
        pmin(pmax(sem_sim_raw, -1.00), 1.00)
      },
      # Language variables built as sem_sim * slope + Gaussian noise.
      # NOTE(review): cog_processing_language has no slope on sem_sim,
      # unlike every other variable (observed r ~ .48 in the output
      # table) -- confirm whether sem_sim * 0.45 was intended.
      cog_processing_language = sem_sim + rnorm(n(), mean = 0, sd = sqrt(1 - 0.45^2)),
      memory_language = sem_sim * 0.20 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.20^2)),
      emo_anxious_language = sem_sim * 0.10 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.04^2)),
      emo_sad_language = sem_sim * 0.15 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.02^2)),
      emo_anger_language = sem_sim * 0.19 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.09^2)),
      certain_language = sem_sim * 0.17 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.07^2)),
      self_ref_language = sem_sim * 0.21 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.21^2)),
      curious_language = sem_sim * 0.10 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.10^2)),
      # One VRM category per turn, sampled uniformly with replacement.
      vrm = sample(vrm, n(), replace = TRUE)
    )

  # PARTICIPANT-LEVEL VARIABLES
  if (paradigm == "DI") {
    # Dyadic interaction: each video has its own target/perceiver/partner.
    df <- df %>%
      mutate(
        target = paste0(name, "_target_", n_video),
        perceiver = paste0(name, "_perceiver_", n_video),
        partner = paste0(name, "_partner_", n_video),
        paradigm = "Dyadic Interaction"
      )
  } else if (paradigm == "SS") {
    # Standard stimulus: each perceiver rates a random subset of videos.
    # Duplicate the name because nest() removes the grouping column.
    df <- df %>%
      mutate(
        name2 = name
      )
    df_list <- df %>%
      group_by(name) %>%
      nest()

    out_list <- vector("list", n_perceivers)

    for (i in seq_len(n_perceivers)) {
      # Sample (without replacement) this perceiver's videos.
      df_i <- sample(df_list$data, n_videos_per_perceiver) %>%
          bind_rows()
      df_i <- df_i %>%
        mutate(
          target = paste0(name2, "_target_", n_video),
          perceiver = paste0(name2, "_perceiver_", i),
          partner = paste0(name2, "_partner_", n_video),
          paradigm = "Standard Stimulus"
        )
      out_list[[i]] <- df_i
    }
    df <- bind_rows(out_list)
    df$name <- df$name2
    df <- df %>%
      select(-name2)
  }
  return(df)
}
# Simulate every study and stack the results into one turn-level tibble.
df <- list(
           stem = SimulateStudy("stem", paradigm = "DI"),
           barter = SimulateStudy("barter", paradigm = "DI"),
           deception_detection = SimulateStudy("deception_detection", paradigm = "DI"),
           new_moms = SimulateStudy("new_moms", 
                                    paradigm = "SS", 
                                    n_perceivers = 3, 
                                    n_videos_per_perceiver = 3),
           karyn_diss = SimulateStudy("karyn_diss", 
                                      paradigm = "SS", 
                                      n_perceivers = 212, 
                                      n_videos_per_perceiver = 3),
           # BUG FIX: this entry simulated "karyn_diss" a second time;
           # the murat_rep element must simulate the "murat_rep" study.
           murat_rep = SimulateStudy("murat_rep", 
                                     paradigm = "SS", 
                                     n_perceivers = 200, 
                                     n_videos_per_perceiver = 3),
           mideast_men = SimulateStudy("mideast_men", 
                                       paradigm = "SS",
                                       n_perceivers = 326, 
                                       n_videos_per_perceiver = 4),
           double_empathy = SimulateStudy("double_empathy", 
                                          paradigm = "SS",
                                          n_perceivers = 100, 
                                          n_videos_per_perceiver = 4)
          ) %>% 
  bind_rows() %>% 
  ungroup()
## Warning: Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
# Coerce every character column to a factor in a single pass.
df <- mutate(df, across(where(is.character), as.factor))

Structural Hypotheses

Semantic similarity increases closer to the chapter's end

# Mean sem_sim at each within-chapter position, for the smoothed overlay.
avg_data <- df %>%
  group_by(turn_percent_through_chapter) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")

# Spaghetti plot: one faint gray line per perceiver, plus a loess-smoothed
# mean trajectory in black.
ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
  # FIX: the `size` aesthetic for lines is deprecated since ggplot2 3.4.0
  # (the rendered warning confirms this); use `linewidth` instead.
  geom_line(aes(group = perceiver), color = "gray", 
            alpha = 0.01, linewidth = 0.5) +
  geom_hline(aes(yintercept = 0), color = "black") +
  geom_smooth(data = avg_data, aes(x = turn_percent_through_chapter, 
                                   y = sem_sim),
              method = "loess", se = FALSE, color = "black") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity",
    color = "Perceiver"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Semantic similarity increases for both Target and Partner as the chapter approaches the inference point, but it increases more for the Target than for the Partner

# Mean sem_sim at each within-chapter position, separately by speaker.
avg_data <- df %>%
  group_by(turn_percent_through_chapter, speaker) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")

ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
  # FIX: `size` for lines is deprecated since ggplot2 3.4.0 -> `linewidth`.
  geom_line(aes(group = perceiver, color = speaker), 
            alpha = 0.005, linewidth = 0.5) +
  scale_color_manual(
    values = c("Partner" = "red", "Target" = "blue"),  
    name = "Speaker"
  ) +
  geom_hline(aes(yintercept = 0), color = "black") +
  # Separate average lines for Target and Partner.
  # FIX: the smoothed lines were colored opposite to scale_color_manual
  # (Target drawn red, Partner blue); they now match the raw lines.
  geom_smooth(data = avg_data %>% filter(speaker == "Target"), 
              aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "blue") +
  geom_smooth(data = avg_data %>% filter(speaker == "Partner"),
              aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "red") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity",
    color = "Speaker"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  ) +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Linguistic Figures

LanguageFigsFunction <- function(df, variable, var_name = "VARIABLE"){
  # Scatterplot of one language variable against semantic similarity,
  # with an lm fit and the observed correlation in the caption.
  #
  # Args:
  #   df: data containing `sem_sim` and the column named by `variable`.
  #   variable: string name of the y-axis column.
  #   var_name: display label used in the title, axis, and caption.
  #
  # Returns: a ggplot object.
  ggplot(df, aes(x = sem_sim, y = !!sym(variable))) +
    geom_point(color = "black", alpha = 0.01, size = 0.5) + 
    geom_smooth(color = "black", method = "lm", se = TRUE) +
    # FIX: namespace theme_apa for consistency with the other figures.
    papaja::theme_apa(base_family = "Times New Roman") +
    labs(
      title = paste0("Correlation Between\n ", 
                     var_name, " and Semantic Similarity"),
      x = "Semantic Similarity",
      y = var_name,
      # `[[` extracts plain vectors so cor() returns a scalar, not the
      # 1x1 matrix produced by cor(df["sem_sim"], df[variable]).
      caption = paste0("Correlation = ",
                       round(cor(df[["sem_sim"]], df[[variable]]), 2))
  ) 
}
# One scatterplot per simulated language variable vs. semantic similarity;
# captions report the observed correlation for each panel.
LanguageFigsFunction(df, variable = "cog_processing_language", "Cognitive Processing Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "memory_language", "Memory Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_anxious_language", "Anxious Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_sad_language", "Sad Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "emo_anger_language", "Anger Emotion")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "certain_language", "Certainty Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "self_ref_language", "Self-Referential Language")
## `geom_smooth()` using formula = 'y ~ x'

LanguageFigsFunction(df, variable = "curious_language", "Curiousity Language")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation matrix of semantic similarity and the simulated language
# variables, rendered as a formatted table. select() both picks the
# columns and applies the display names in a single step.
df %>% 
  select(
    "Semantic Similarity" = sem_sim,
    "Cognitive Processing" = cog_processing_language,
    "Memory" = memory_language,
    "Anxious" = emo_anxious_language,
    "Sad" = emo_sad_language,
    "Anger" = emo_anger_language,
    "Self-Referential" = self_ref_language,
    "Curiousity" = curious_language,
    "Certainty" = certain_language
  ) %>% 
  cor() %>% 
  kbl(digits = 2) %>% 
  kable_classic()
Semantic Similarity Cognitive Processing Memory Anxious Sad Anger Self-Referential Curiousity Certainty
Semantic Similarity 1.00 0.48 0.11 0.03 0.06 0.09 0.10 0.03 0.08
Cognitive Processing 0.48 1.00 0.05 0.02 0.02 0.05 0.06 -0.01 0.04
Memory 0.11 0.05 1.00 0.00 0.00 0.00 0.01 0.03 0.00
Anxious 0.03 0.02 0.00 1.00 -0.01 0.00 0.01 0.01 -0.02
Sad 0.06 0.02 0.00 -0.01 1.00 -0.01 0.02 0.01 0.02
Anger 0.09 0.05 0.00 0.00 -0.01 1.00 0.02 0.01 0.00
Self-Referential 0.10 0.06 0.01 0.01 0.02 0.02 1.00 -0.01 0.02
Curiousity 0.03 -0.01 0.03 0.01 0.01 0.01 -0.01 1.00 -0.02
Certainty 0.08 0.04 0.00 -0.02 0.02 0.00 0.02 -0.02 1.00